Introduction

tidybulk integrates several methods for differential transcript abundance testing.

Comparison of methods

We can serially perform differential analyses with several methods, and the results will be added to the original dataset.

We first pre-process the data, creating a tibble and identifying abundant genes.

pasilla_de <- 
  rpharma2020tidytranscriptomics::pasilla %>% 
  
  # Convert SE object to tibble
  tidybulk %>%
  
   # Scale abundance for plotting
  identify_abundant(factor_of_interest=condition) 

This is an example for the default method for differential transcriptional testing. It uses the edgeR method.

pasilla_de %>%
  
  # Test differential composition
  test_differential_abundance(
    ~ condition + type, 
    action="get"
  ) %>%
  arrange(FDR)
#> tidybulk says: All methods use raw counts, 
#> irrespective of if scale_abundance or adjust_abundance have been calculated, 
#> therefore it is essential to add covariates such as batch effects (if applicable) in the formula.
#> tidybulk says: The design column names are "(Intercept), conditionuntreated, typesingle_end"
#> tidybulk says: to access the raw results (fitted GLM) do `attr(..., "internals")$edgeR`
#> # A tibble: 14,599 x 7
#>    feature     .abundant logFC logCPM     F   PValue          FDR
#>    <chr>       <lgl>     <dbl>  <dbl> <dbl>    <dbl>        <dbl>
#>  1 FBgn0025111 TRUE      -2.86   6.92 1304. 2.37e-12 0.0000000149
#>  2 FBgn0039155 TRUE       4.61   5.88 1194. 3.75e-12 0.0000000149
#>  3 FBgn0003360 TRUE       3.12   8.45  792. 3.17e-11 0.0000000837
#>  4 FBgn0035085 TRUE       2.57   5.68  728. 4.89e-11 0.0000000968
#>  5 FBgn0029167 TRUE       2.19   8.22  599. 1.34e-10 0.000000169 
#>  6 FBgn0034736 TRUE       3.52   4.18  586. 1.50e-10 0.000000169 
#>  7 FBgn0039827 TRUE       4.17   4.39  616. 1.16e-10 0.000000169 
#>  8 FBgn0026562 TRUE       2.47  11.8   504. 3.27e-10 0.000000323 
#>  9 FBgn0000071 TRUE      -2.63   4.79  389. 1.24e- 9 0.000000983 
#> 10 FBgn0034434 TRUE       3.63   3.21  390. 1.22e- 9 0.000000983 
#> # … with 14,589 more rows

Now let’s try to perform multiple methods on the same dataset.

de_all <- 
  
  pasilla_de %>%
  
  # edgeR QLT
  test_differential_abundance(
    ~ condition + type, 
    method = "edger_quasi_likelihood",
    prefix = "edgerQLT_"
  )  %>%
  
  # edgeR LRT
  test_differential_abundance(
    ~ condition + type, 
    method = "edger_likelihood_ratio",
    prefix = "edgerLR_"
  )  %>%
  
  # limma-voom
  test_differential_abundance(
    ~ condition + type, 
    method = "limma_voom",
    prefix = "voom_"
  ) %>%
  
  # DESeq2
  test_differential_abundance(
    ~ condition + type, 
    method = "deseq2",
    prefix = "deseq2_"
  ) 

We can visually compare the log fold change (logFC) of transcript abundance for the comparison of interest (treated vs untreated) for all methods. We will notice that the consistency of the logFC is really high for the methods.

de_all %>%
  keep_abundant() %>%
  pivot_transcript() %>%
  select(edgerQLT_logFC, edgerLR_logFC, voom_logFC, deseq2_log2FoldChange, feature ) %>%
  ggpairs(1:4)

Similarly, we can visually compare the significance for all methods. In this case the difference is larger.

de_all %>%
  keep_abundant() %>%
  pivot_transcript() %>%
  select(edgerQLT_PValue, edgerLR_PValue, voom_P.Value, deseq2_pvalue, feature ) %>%
  ggpairs(1:4)

We can investigate some of the genes that are less consistent between methods, for example between edgeR QLT and DESeq2. In this case we can use plotly to produce an interactive plot.

# Produce the ggplot
(
  de_all %>%
    keep_abundant() %>%
    ggplot(aes(edgerQLT_PValue, deseq2_pvalue, label = feature)) + 
    geom_point()
) %>% 
  
  # Make it interactive
  ggplotly()

We can select some of the transcripts for further analysis using the tidygate package.

de_gate = 
  de_all %>%
  
  keep_abundant() %>%
  
  gate(
    feature,
    edgerQLT_PValue, 
    deseq2_pvalue, 
    opacity=0.3, 
    how_many_gates = 2 
  )
#> # A tibble: 55,433 x 29
#>    feature sample counts condition type  .abundant edgerQLT_logFC
#>    <chr>   <fct>   <int> <fct>     <fct> <lgl>              <dbl>
#>  1 FBgn00… untrt1     92 untreated sing… TRUE              0.0337
#>  2 FBgn00… untrt1   4664 untreated sing… TRUE              0.249 
#>  3 FBgn00… untrt1    583 untreated sing… TRUE              0.0569
#>  4 FBgn00… untrt1   1446 untreated sing… TRUE              0.0402
#>  5 FBgn00… untrt1     15 untreated sing… TRUE             -0.387 
#>  6 FBgn00… untrt1 101664 untreated sing… TRUE             -0.310 
#>  7 FBgn00… untrt1  33402 untreated sing… TRUE             -0.617 
#>  8 FBgn00… untrt1     21 untreated sing… TRUE             -0.491 
#>  9 FBgn00… untrt1     12 untreated sing… TRUE             -0.610 
#> 10 FBgn00… untrt1   3026 untreated sing… TRUE              0.163 
#> # … with 55,423 more rows, and 22 more variables: edgerQLT_logCPM <dbl>,
#> #   edgerQLT_F <dbl>, edgerQLT_PValue <dbl>, edgerQLT_FDR <dbl>,
#> #   edgerLR_logFC <dbl>, edgerLR_logCPM <dbl>, edgerLR_LR <dbl>,
#> #   edgerLR_PValue <dbl>, edgerLR_FDR <dbl>, voom_logFC <dbl>,
#> #   voom_AveExpr <dbl>, voom_t <dbl>, voom_P.Value <dbl>, voom_adj.P.Val <dbl>,
#> #   voom_B <dbl>, deseq2_baseMean <dbl>, deseq2_log2FoldChange <dbl>,
#> #   deseq2_lfcSE <dbl>, deseq2_stat <dbl>, deseq2_pvalue <dbl>,
#> #   deseq2_padj <dbl>, gate <chr>

We can now select the transcripts from the two gates (i.e. oversignificant in DESeq2 and oversignificant in edgeR)


de_gate %>% 
  scale_abundance() %>%
  
  # Filter only transcripts within the gates
  filter(gate > 0) %>% 
  
  # Rename for clarity
  mutate(gate = case_when(
    gate == 1 ~ "more in edgeR",
    gate == 2 ~ "more in DESeq2",
    TRUE ~ gate
  )) %>%
  
  # Rearrange order
  mutate(feature = fct_reorder(feature, edgerQLT_PValue, min)) %>%
  
  # Plot
  ggplot(aes(condition, counts_scaled, color=gate)) +
  geom_point() +
  facet_wrap(~feature, scale="free_y") +
  custom_theme

For example DESeq2 produces a more conservative statistic for the transcript FBgn0052939.

de_gate %>%
  pivot_transcript %>%
  filter(feature == "FBgn0052939")%>%
  select(edgerQLT_logFC, deseq2_log2FoldChange)
#> # A tibble: 1 x 2
#>   edgerQLT_logFC deseq2_log2FoldChange
#>            <dbl>                 <dbl>
#> 1           2.37                  1.65

Contributing

If you want to suggest improvements for this workshop or ask questions, you can do so as described here.

Reproducibility

Record package and version information with sessionInfo

sessionInfo()
#> R version 4.0.2 Patched (2020-09-24 r79253)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.1 LTS
#> 
#> Matrix products: default
#> BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] plotly_4.9.2.1 GGally_2.0.0   ggplot2_3.3.2  tidygate_0.2.8 tidybulk_1.1.8
#> [6] forcats_0.5.0  dplyr_1.0.2   
#> 
#> loaded via a namespace (and not attached):
#>   [1] bitops_1.0-6                         matrixStats_0.57.0                  
#>   [3] fs_1.5.0                             bit64_4.0.5                         
#>   [5] RColorBrewer_1.1-2                   httr_1.4.2                          
#>   [7] rprojroot_1.3-2                      GenomeInfoDb_1.25.11                
#>   [9] tools_4.0.2                          backports_1.1.10                    
#>  [11] utf8_1.1.4                           R6_2.4.1                            
#>  [13] DBI_1.1.0                            lazyeval_0.2.2                      
#>  [15] BiocGenerics_0.35.4                  colorspace_1.4-1                    
#>  [17] withr_2.3.0                          tidyselect_1.1.0                    
#>  [19] gridExtra_2.3                        DESeq2_1.29.13                      
#>  [21] bit_4.0.4                            compiler_4.0.2                      
#>  [23] preprocessCore_1.51.0                cli_2.0.2                           
#>  [25] Biobase_2.49.1                       desc_1.2.0                          
#>  [27] DelayedArray_0.15.9                  labeling_0.3                        
#>  [29] scales_1.1.1                         readr_1.3.1                         
#>  [31] genefilter_1.71.0                    pkgdown_1.6.1                       
#>  [33] systemfonts_0.3.1                    stringr_1.4.0                       
#>  [35] digest_0.6.25                        rmarkdown_2.3                       
#>  [37] XVector_0.29.3                       pkgconfig_2.0.3                     
#>  [39] htmltools_0.5.0                      limma_3.45.14                       
#>  [41] htmlwidgets_1.5.1                    rlang_0.4.7                         
#>  [43] RSQLite_2.2.0                        farver_2.0.3                        
#>  [45] generics_0.0.2                       rpharma2020tidytranscriptomics_1.0.6
#>  [47] jsonlite_1.7.1                       crosstalk_1.1.0.1                   
#>  [49] BiocParallel_1.23.2                  RCurl_1.98-1.2                      
#>  [51] magrittr_1.5                         GenomeInfoDbData_1.2.3              
#>  [53] Matrix_1.2-18                        Rcpp_1.0.5                          
#>  [55] munsell_0.5.0                        S4Vectors_0.27.13                   
#>  [57] fansi_0.4.1                          viridis_0.5.1                       
#>  [59] lifecycle_0.2.0                      stringi_1.5.3                       
#>  [61] yaml_2.2.1                           edgeR_3.31.4                        
#>  [63] SummarizedExperiment_1.19.6          zlibbioc_1.35.0                     
#>  [65] plyr_1.8.6                           blob_1.2.1                          
#>  [67] grid_4.0.2                           parallel_4.0.2                      
#>  [69] crayon_1.3.4                         lattice_0.20-41                     
#>  [71] splines_4.0.2                        annotate_1.67.1                     
#>  [73] hms_0.5.3                            locfit_1.5-9.4                      
#>  [75] knitr_1.30                           pillar_1.4.6                        
#>  [77] GenomicRanges_1.41.6                 geneplotter_1.67.0                  
#>  [79] stats4_4.0.2                         XML_3.99-0.5                        
#>  [81] glue_1.4.2                           evaluate_0.14                       
#>  [83] data.table_1.13.0                    BiocManager_1.30.10                 
#>  [85] png_0.1-7                            vctrs_0.3.4                         
#>  [87] gtable_0.3.0                         purrr_0.3.4                         
#>  [89] tidyr_1.1.2                          reshape_0.8.8                       
#>  [91] assertthat_0.2.1                     cpp11_0.2.1                         
#>  [93] xfun_0.17                            xtable_1.8-4                        
#>  [95] survival_3.2-3                       ragg_0.3.1                          
#>  [97] viridisLite_0.3.0                    tibble_3.0.3                        
#>  [99] AnnotationDbi_1.51.3                 memoise_1.1.0                       
#> [101] IRanges_2.23.10                      ellipsis_0.3.1

References